# Packages --------------------------------------------------------------------
# NOTE(review): readxl is loaded but no read_excel()/read_xlsx() call appears
# anywhere in this file -- confirm it is still needed.
library(readxl)  # unquoted, consistent with the other library() calls below
library(dplyr)
library(ggplot2)
library(tidyverse)
library(ggthemes)
library(sf)
library(DT)
# Kickstarter projects snapshot (March 2021)
ks <- read.csv("kickstarter_projects_2021-03.csv")

1. Identifying Successful Projects

a) Success by Category

# Rank top-level categories by average achievement ratio (pledged / goal, in
# percent). (Variable name says "by_state" but this is really by category.)
# NOTE(review): a project with goal == 0 would produce an Inf ratio here --
# confirm the data contains no zero goals.
ks_most_successful_by_state <- ks %>%
  select(top_category, pledged, goal) %>%
  mutate(achievement_ratio = round(pledged / goal * 100, 1)) %>%
  group_by(top_category) %>%
  # Name the summary column directly instead of renaming `mean(...)` afterwards
  summarise(average_achievement_ratio = mean(achievement_ratio)) %>%
  arrange(desc(average_achievement_ratio)) %>%
  # slice_max() replaces the superseded top_n(); like top_n(), it keeps ties
  slice_max(average_achievement_ratio, n = 10)
datatable(ks_most_successful_by_state)
# Bar chart of the top-10 categories, ordered by average achievement ratio
ggplot(ks_most_successful_by_state,
       aes(x = reorder(top_category, average_achievement_ratio),
           y = average_achievement_ratio)) +
  geom_bar(stat = "identity", fill = "orange") +
  labs(title = "Top 10 Most Successful Top Categories in Attracting Funding by Achievement Ratio",
       caption = "Source:Kickstarter",
       y = "Average Achievement Ratio") +
  theme_pander() +
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
        axis.text.x = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(size = 8, face = "bold"),
        axis.title.y = element_text(size = 11, face = "bold"),
        axis.title.x = element_blank())

Finding: From the above graph, we can see that Music is the most successful top category in attracting funding in terms of achievement ratio. The top 10 categories are concentrated in entertainment and the arts, such as music, comics, design, games, publishing, art, film & video, crafts, and fashion.

BONUS ONLY: b) Success by Location

Note: I use the state variable to identify the most innovative states and cities. Top 10 most innovative states: the states with the most projects whose state is “successful”. Top 50 most innovative cities: the cities with the most projects whose state is “successful”.

# I changed location_state into STUSPS to prepare for the merging with shapefile
total_successful_case_per_state <- ks %>%
  filter(state == "successful") %>%
  group_by(location_state) %>%
  count(location_state) %>%
  arrange(desc(n)) %>%
  rename(`Total Successful Cases` = n) %>%
  rename(STUSPS = location_state)
# NOTE(review): this groups by town name only, so same-named towns in
# different states are pooled before the unite() step below -- confirm that
# is acceptable for the counts.
total_successful_case_per_city <- ks %>%
  filter(state == "successful") %>%
  select(location_state, location_town) %>%
  group_by(location_town) %>%
  mutate(n = n()) %>%
  arrange(desc(n)) %>%
  rename(`Total Successful Cases` = n) %>%
  distinct(location_town, .keep_all = TRUE)
# Assign a rank to every state/city by total number of successful cases.
# seq_len(nrow(...)) replaces the hard-coded seq_along(1:51) and
# seq_along(1:4552), so the ranks stay correct if the number of states or
# cities in the data changes.
most_innovative_state_rank <- data.frame(rank = seq_len(nrow(total_successful_case_per_state)),
                                         total_successful_case_per_state)
most_innovative_city_rank <- data.frame(rank = seq_len(nrow(total_successful_case_per_city)),
                                        total_successful_case_per_city)
datatable(most_innovative_state_rank)
# Take the 50 highest-ranked cities and build the join key for the shapefile.
top_50_innovative_city <- most_innovative_city_rank[1:50, ] %>%
  # Many cities share a name across different states, so combine town + state
  # into a single "city state" key that selects the right rows from the city
  # shapefile unambiguously
  unite(city_state, c("location_town", "location_state"), sep = " ") %>%
  # Kickstarter spells this city "St. Louis MO" while the shapefile uses
  # "Saint Louis MO"; align the Kickstarter spelling with the shapefile
  mutate(city_state = replace(city_state,
                              city_state == "St. Louis MO",
                              "Saint Louis MO"))
datatable(top_50_innovative_city)
library(leaflet)
# NOTE(review): rgdal was retired from CRAN in 2023; consider migrating this
# read to sf::st_read() (sf is already loaded at the top of this file).
library(rgdal)
# library() rather than require(): require() only warns when a package is
# missing, deferring the failure to a confusing place downstream.
library(sp)
states <- readOGR(dsn = "tl_2017_us_state/tl_2017_us_state.shp")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/shiyinglai/Documents/GitHub/Laishiying/hw 03/tl_2017_us_state/tl_2017_us_state.shp", layer: "tl_2017_us_state"
## with 56 features
## It has 14 fields
## Integer64 fields read as strings:  ALAND AWATER
# Attach the rank table to the state polygons
states <- merge(states, most_innovative_state_rank, by = "STUSPS")
# A single case_when() with an explicit default replaces the previous
# two-step mutate + is.na() patch. States with rank NA (no successful
# projects) or rank > 10 both fall through to the "None" label; the trailing
# space in that label is kept byte-for-byte because it is part of the legend
# text the map already displays.
states@data <- states@data %>%
  mutate(top10_states = case_when(
    rank <= 10 ~ "Top 10 innovative states",
    TRUE ~ "None top 10 innovative states "
  ))
#This is how I get the location of most innovative cities from a US City shapefile
# town <- readOGR(dsn = "tufts-uscitiestowns1mil14-shapefile/GISPORTAL_GISOWNER01_USCITIESTOWNS1MIL14.shp")
# town@data<-town@data %>% unite(city_state,c("NAME","STATE"),sep=" ",remove = FALSE)
# city<-merge(town,top_50_innovative_city,by="city_state")
# city@data<-city@data %>% filter(!is.na(rank))
# write.csv(x=city@data, file="city_long_lat")
# Pre-computed city coordinates written by the commented-out block above.
# NOTE(review): the file was written without a .csv extension and read.csv()
# will include write.csv's row-number column -- presumably harmless; verify.
city_long_lat<-read.csv("city_long_lat")
# HTML popup text for the state polygons; paste() is vectorized, so this is
# one popup string per state
content_states <- paste("state:",states@data$NAME,"<br/>",
                 "Rank:",states@data$rank,"<br/>"
          )

# HTML popup text for the city markers, one string per city
content_city <- paste("city:",city_long_lat$city_state,"<br/>",
                 "Rank:",city_long_lat$rank,"<br/>"
          )
library(RColorBrewer)
# Factor palette mapping the two top10_states labels to two fill colors
colorsConst <- colorFactor(palette = c("steelblue","lightcyan"), states$top10_states)

I add popups that display each location's rank.

# Interactive map: the top-10 states layer (filled polygons + legend) and the
# top-50 cities layer (circle markers), each toggleable, with rank popups.
# Two stray empty arguments from the original call chain are removed: the
# ",," in addCircleMarkers() and the trailing "," in addLegend().
leaflet() %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  setView(lat = 37, lng = -95, zoom = 4) %>%
  addPolygons(group = "Top 10 most innovative states", data = states,
              stroke = TRUE, smoothFactor = 0.5, weight = 1, color = '#333333',
              opacity = 1, fillColor = ~colorsConst(top10_states),
              fillOpacity = 1, popup = content_states) %>%
  addLegend(group = "Top 10 most innovative states",
            "bottomright",
            pal = colorsConst,
            values = states$top10_states,
            title = "Top 10 Innovative States",
            opacity = 1) %>%
  addCircleMarkers(group = "Top 50 most innovative cities",
                   data = city_long_lat, lng = ~LONGITUDE, lat = ~LATITUDE,
                   stroke = FALSE, fillOpacity = 1, popup = content_city,
                   color = "orange") %>%
  addLayersControl(overlayGroups = c("Top 10 most innovative states",
                                     "Top 50 most innovative cities"),
                   options = layersControlOptions(collapsed = FALSE))

2. Writing your success story

a) Cleaning the Text and Word Cloud

library(tm)
library(tidytext)
# Standard cleaning pipeline for a tm corpus: strip punctuation, lower-case
# everything, drop English stopwords, remove digits, and collapse repeated
# whitespace. Returns the cleaned corpus. Extra stopwords can be appended to
# the c(stopwords("en")) vector below.
clean_corpus <- function(corpus) {
  corpus %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, c(stopwords("en"))) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace)
}

Note: The most successful projects are selected as the top 1000 projects by backer count among projects with a state of “successful”. The 1000 failed projects are a random sample from the projects with a state of “failed”.

# Top 1000 successful projects by backer count, and a random sample of 1000
# failed projects. set.seed() makes the random sample -- and therefore every
# downstream finding -- reproducible across knits (the original had no seed
# here, so the failed sample changed on every run).
set.seed(11)
a_thousand_successful_projects <- ks %>%
  filter(state == "successful") %>%
  arrange(desc(backers_count))
a_thousand_successful_projects <- a_thousand_successful_projects[1:1000, ]
# slice_sample() replaces the superseded sample_n()
a_thousand_failed_projects <- ks %>%
  filter(state == "failed") %>%
  slice_sample(n = 1000)
# Pool each group's blurbs into one long document per group
successful_whole <- stripWhitespace(paste(a_thousand_successful_projects$blurb, collapse = " "))
failed_whole <- stripWhitespace(paste(a_thousand_failed_projects$blurb, collapse = " "))
# Remove the non-English (non-ASCII) characters
successful_whole <- iconv(successful_whole, "latin1", "ASCII", sub = "")
failed_whole <- iconv(failed_whole, "latin1", "ASCII", sub = "")
# Remove fully-capitalized words (acronyms); this must run before the corpus
# cleaning lower-cases the text
successful_whole <- gsub("\\b[A-Z]+\\b", "", successful_whole)
failed_whole <- gsub("\\b[A-Z]+\\b", "", failed_whole)
# Build a two-document corpus -- document 1 holds all successful blurbs,
# document 2 all failed blurbs -- then clean it and turn it into a
# term-document matrix with readable column names.
whole <- c(successful_whole, failed_whole)
whole_tm <- VCorpus(VectorSource(whole))
whole_clean <- clean_corpus(whole_tm)
whole_dtm <- TermDocumentMatrix(whole_clean)
whole_m <- as.matrix(whole_dtm)
colnames(whole_m) <- c("successful_whole", "failed_whole")

The document-term-matrix

datatable(whole_m)
# tf / tf-idf for the successful-projects text on its own. This is a
# one-document corpus, so idf and tf-idf are both zero -- tf carries the
# signal and drives the ordering.
success_tm <- VCorpus(VectorSource(successful_whole))
success_clean <- clean_corpus(success_tm)
success_dtm <- TermDocumentMatrix(success_clean)
success_tidy <- tidy(success_dtm)
success_tf_idf <- success_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf))

The measure I choose here is tf.

Since I only have one document, the idf and tf-idf are both zero.

datatable(success_tf_idf)
library(wordcloud)
# Fixed seed so the cloud layout is reproducible
set.seed(11)
# Word cloud of the 100 highest-tf terms from the successful blurbs
with(success_tf_idf,
     wordcloud(term, tf, max.words = 100, colors = c("skyblue", "orange")))

b) Success in words

# Keep only the terms that appear in BOTH the successful and failed documents
common <- whole_m[whole_m[, 1] > 0 & whole_m[, 2] > 0, ]
head(common)
##              Docs
## Terms         successful_whole failed_whole
##   abandoned                  2            2
##   access                     4            3
##   accessible                 3            3
##   accessories                2            1
##   achieve                    2            1
##   acoustic                   1            2
# Rank the shared terms by the absolute gap in frequency between the two
# documents, then keep the 15 terms with the largest gap.
difference <- abs(common[, 1] - common[, 2])
common <- cbind(common, difference)
common <- common[order(difference, decreasing = TRUE), ]
top15 <- data.frame(successed = common[1:15, 1],
                    failed = common[1:15, 2],
                    term = rownames(common)[1:15])
datatable(top15)
library(plotrix)
# Back-to-back pyramid plot of the 15 terms with the largest frequency gap.
# Typos fixed in the displayed text only: "Frenquency" -> "Frequency" in the
# title and "Successed" -> "Successful" in the column label.
p <- pyramid.plot(top15$successed, top15$failed, labels = top15$term,
             gap = 10, top.labels = c("Successful", " ", "Failed"),
             main = "Words Frequency Difference for Successful Projects and Failed Projects", laxlab = NULL,
             raxlab = NULL, unit = NULL, labelcex=0.8)

## 111 111

c) Simplicity as a virtue

# library() rather than require() for a hard dependency: require() only warns
# on failure, which would surface later as a confusing missing-function error.
library(quanteda)
# Flesch Reading Ease and Flesch-Kincaid grade for the blurbs of the 1000
# most successful projects
s_fk <- textstat_readability(a_thousand_successful_projects$blurb,
                             measure = c('Flesch.Kincaid', "Flesch"))

Flesch Reading Ease and Flesch-Kincaid scores for the 1000 most successful projects

datatable(s_fk)
# Flesch Reading Ease and Flesch-Kincaid grade for the blurbs of the 1000
# failed projects
f_fk<-textstat_readability(a_thousand_failed_projects$blurb,
              measure=c('Flesch.Kincaid',"Flesch"))

Flesch Reading Ease and Flesch-Kincaid scores for the 1000 failed projects

datatable(f_fk)
# Attach the readability scores to their project rows, then pool the
# successful and failed sets into one 2000-row frame
s_fk <- cbind(a_thousand_successful_projects, s_fk)
f_fk <- cbind(a_thousand_failed_projects, f_fk)
whole_fk <- rbind(s_fk, f_fk)
# Scatter of backer count against Flesch-Kincaid grade with an OLS trend
# line. Typo fixed in the title: "relatonship" -> "relationship".
ggplot(whole_fk, aes(x = `Flesch.Kincaid`, y = backers_count)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  coord_cartesian(ylim = c(0, 92000)) +
  scale_x_continuous(breaks = seq(0, 30, 3)) +
  scale_y_continuous(breaks = seq(0, 92000, 10000)) +
  labs(title = "The relationship between Flesch.Kincaid Score and backers number for 2000 projects",
       y = "backers number", x = "Flesch.Kincaid Score",
       caption = "Source:Kickstarter") +
  theme_pander() +
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
        axis.text.x = element_text(size = 14, face = "bold"),
        axis.text.y = element_text(size = 10, face = "bold"),
        axis.title.y = element_text(size = 14, face = "bold"),
        axis.title.x = element_text(size = 14, face = "bold"),
        legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

From the above graph, we can see there is no obvious relationship between the Flesch.Kincaid score and a project's backer count for these 2000 projects.

# Average backer count per distinct Flesch-Kincaid score (one group per
# unique score). The summary column keeps its default name
# `mean(backers_count)` because the plotting code below refers to it by
# that exact name.
s_count<-s_fk %>% group_by(`Flesch.Kincaid`) %>% summarise(mean(backers_count))
s_count
## # A tibble: 443 x 2
##    Flesch.Kincaid `mean(backers_count)`
##  *          <dbl>                 <dbl>
##  1         -3.40                   7037
##  2         -2.88                   2733
##  3         -1.84                   5205
##  4         -0.670                  9690
##  5         -0.572                  2705
##  6          0.500                  5160
##  7          0.517                  2233
##  8          0.626                  2566
##  9          0.805                  2209
## 10          0.819                  2482
## # … with 433 more rows
# Average backers per Flesch-Kincaid score, successful projects only.
# NOTE(review): coord_cartesian() only zooms the axis -- points outside the
# window still enter the lm fit; confirm that matches the stated intent of
# "eliminating" the outliers.
ggplot(s_count, aes(x = `Flesch.Kincaid`, y = `mean(backers_count)`)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  coord_cartesian(ylim = c(2000, 32000)) +
  scale_x_continuous(breaks = seq(-4, 20, 3)) +
  scale_y_continuous(breaks = seq(2000, 32000, 2000)) +
  labs(title = "Average Backers Number Per Flesch.Kincaid Score Group in 1000 most successful projects",
       y = "average backers number", x = "Flesch.Kincaid Score",
       caption = "Source:Kickstarter") +
  theme_pander() +
  theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5),
        axis.text.x = element_text(size = 14, face = "bold"),
        axis.text.y = element_text(size = 10, face = "bold"),
        axis.title.y = element_text(size = 14, face = "bold"),
        axis.title.x = element_text(size = 14, face = "bold"),
        legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

I created the above graph using only the 1000 most successful projects, eliminating three extreme outliers in average backer number (39560, 91585, and 88887). After eliminating the outliers, the Flesch.Kincaid score and the average backer number show a slightly positive relationship: as a blurb becomes harder to read, the average backer number slightly increases for the 1000 most successful projects.

3. Sentiment

a) Stay positive

Note: Part a) does not specify the scope of the analysis, so I use the 2000 projects from Q2 here.

# Opinion-lexicon word lists (presumably the Hu & Liu lists -- verify the
# files); column 1 holds the words. as.is=T keeps them as character
# vectors rather than factors.
pos <- read.table("positive-words.txt", as.is=T)
neg <- read.table("negative-words.txt", as.is=T)
# Net sentiment of a text: (positive - negative) / (positive + negative)
# word counts, scored against the global lexicons `pos` and `neg`.
# Returns a value in [-1, 1]. NOTE: a text containing no lexicon words
# yields 0/0 = NaN; downstream code relies on this and filters those out.
sentiment <- function(words = "really great good stuff bad") {
  # Hard dependency check; quanteda:: is called explicitly below, so the
  # package does not need to be attached to the search path.
  if (!requireNamespace("quanteda", quietly = TRUE)) {
    stop("package 'quanteda' is required", call. = FALSE)
  }
  tok <- quanteda::tokens(words)
  pos.count <- sum(tok[[1]] %in% pos[, 1])
  neg.count <- sum(tok[[1]] %in% neg[, 1])
  (pos.count - neg.count) / (pos.count + neg.count)
}
# Score every blurb in the pooled 2000-project frame from Q2.
whole_df <- rbind(a_thousand_successful_projects, a_thousand_failed_projects)
# `<-` rather than `=` for assignment, and distinct names so the sentiment()
# function defined above is not clobbered by its own results.
sentiment_scores <- lapply(tolower(whole_df$blurb), sentiment)
sentiment_df <- as.data.frame(do.call(rbind, sentiment_scores))
whole_sentiment <- cbind(whole_df, sentiment_df)
whole_sentiment <- whole_sentiment %>% rename(sentiment = V1)
# Blurb + score table for display. The variable keeps the original
# "sameple" spelling because later code refers to it by that name.
whole_sentiment_sameple <- whole_sentiment %>% select(blurb, sentiment)

A sample of 50 rows from the sentiment analysis of the 2000 projects

# Show the first 50 scored blurbs
datatable(whole_sentiment_sameple[1:50,])

The total number of successful cases per sentiment score groups

# Backer count against sentiment score, colored by outcome, with one OLS
# trend line per outcome group.
ggplot(whole_sentiment, aes(x = sentiment, y = backers_count, col = state)) +
  geom_point() +
  scale_color_manual(values = c("failed" = "steelblue", "successful" = "orange")) +
  geom_smooth(method = 'lm', se = FALSE) +
  coord_cartesian(ylim = c(0, 92000)) +
  scale_x_continuous(breaks = seq(-1.1, 1.1, 0.1)) +
  scale_y_continuous(breaks = seq(0, 92000, 10000)) +
  labs(title = "Projects' Backers Number VS Sentiment Score for 2000 Projects",
       y = "A Project'S Backer Number", x = "Sentiment Score",
       caption = "Source:Kickstarter") +
  theme_pander() +
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
        axis.text.x = element_text(size = 10, face = "bold"),
        axis.text.y = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.title.x = element_text(size = 10, face = "bold"),
        legend.title = element_blank())

From the above graph we could see, among the most successful project, sentiment score has slightly positive relationship with a project’s backers number. However, among failed projects, there is no obvious relationship between sentiment score and a project’s backers’ number.

b) Positive vs negative

Note: I treat sentiment score greater than zero as positive, I treat sentiment score smaller than zero as negative. A sentiment score with NA are ignored here.

# Split blurbs by sentiment sign; rows with score 0 or NaN fall into neither
# group, matching the note above that NA scores are ignored.
positive <- whole_sentiment %>% filter(sentiment > 0)
negative <- whole_sentiment %>% filter(sentiment < 0)
# Consistent spelling throughout: the original mixed `positve_all` and
# `positive_all`, which only produced the right result by accident of
# assignment order.
positive_all <- paste(positive$blurb, collapse = " ")
negative_all <- paste(negative$blurb, collapse = " ")
# Remove non-English (non-ASCII) characters
positive_all <- iconv(positive_all, "latin1", "ASCII", sub = "")
negative_all <- iconv(negative_all, "latin1", "ASCII", sub = "")
# Remove fully-capitalized words (acronyms) before lower-casing
positive_all <- gsub("\\b[A-Z]+\\b", "", positive_all)
negative_all <- gsub("\\b[A-Z]+\\b", "", negative_all)
# Two-document corpus (positive, negative) -> cleaned term-document matrix
positive_negative <- c(positive_all, negative_all)
positive_negative_tm <- VCorpus(VectorSource(positive_negative))
positive_negative_clean <- clean_corpus(positive_negative_tm)
positive_negative_dtm <- TermDocumentMatrix(positive_negative_clean)
positive_negative_m <- as.matrix(positive_negative_dtm)
colnames(positive_negative_m) <- c("positive", "negative")

The Document Term Matrix

datatable(positive_negative_m)
# Comparison cloud of the top 100 terms: positive-document terms in orange,
# negative-document terms in steelblue.
comparison.cloud(positive_negative_m,
                 colors = c("orange", "steelblue"),
                 scale = c(0.1, 2),
                 title.size = 2,
                 max.words = 100)

c) Get in their mind

Note: I am using the 2000 Projects in Q2

nrc <- get_sentiments("nrc")
# Clean the pooled blurbs the same way as earlier sections: strip non-ASCII
# characters, drop fully-capitalized words, run the shared clean_corpus()
# pipeline, then flatten each cleaned document back to a plain string.
whole_df$blurb <- iconv(whole_df$blurb, "latin1", "ASCII", sub = "")
whole_df$blurb <- gsub("\\b[A-Z]+\\b", "", whole_df$blurb)
whole_df_tm <- VCorpus(VectorSource(whole_df$blurb))
whole_df_tm_clean <- clean_corpus(whole_df_tm)
whole_df_tm_clean_df <- data.frame(
  text = sapply(whole_df_tm_clean, paste, collapse = " "),
  stringsAsFactors = FALSE
)
# Build a quanteda dictionary with one entry per NRC emotion. One loop
# replaces the eight copy-pasted filter() + dictionary() entries; the
# lexicon's "positive"/"negative" categories are deliberately excluded,
# exactly as before.
emotions <- c("anger", "anticipation", "disgust", "fear",
              "joy", "sadness", "surprise", "trust")
nrc_dict <- dictionary(
  setNames(lapply(emotions, function(e) nrc$word[nrc$sentiment == e]),
           emotions)
)
# Count emotion-dictionary hits per blurb.
# NOTE(review): dfm() on a character vector with dictionary= is deprecated in
# quanteda >= 3; the tokens() %>% tokens_lookup() %>% dfm() form is the
# modern equivalent -- confirm the installed quanteda version.
nrc_dfm <- dfm(whole_df_tm_clean_df$text, dictionary = nrc_dict)
nrc_dfm_m <- as.matrix(nrc_dfm)

The DFM for NRC Sentiment

datatable(nrc_dfm_m)
# library() rather than require() for a hard dependency (melt)
library(reshape2)

# Long format: one row per (document, emotion) with its hit count; the
# "textN" document names become numeric join keys.
nrc_dfm_df <- melt(nrc_dfm_m)
nrc_dfm_df <- nrc_dfm_df %>%
  mutate(docs = str_remove(docs, "text")) %>%
  mutate(docs = as.numeric(docs))
# row_number() replaces the hard-coded 1:2000, so the join key stays correct
# if the sample sizes above ever change
whole_df <- whole_df %>% mutate(docs = row_number())
whole_df_with_nrc_score <- merge(nrc_dfm_df, whole_df, all = TRUE)
head(whole_df_with_nrc_score, 1)
##   docs features value backers_count
## 1    1    anger     0         91585
##                                                                                                                    blurb
## 1 : This is it. We're making a Veronica Mars movie! Now the only question is: how big can we make it? We need your help!
##   converted_pledged_amount country country_displayable_name created_at currency
## 1                  5702153     USA        the United States 2011-11-04      USD
##     deadline  goal         id is_starrable launched_at
## 1 2013-04-12 2e+06 1755266685        FALSE  2013-03-13
##                              name pledged                            slug
## 1 The Veronica Mars Movie Project 5702153 the-veronica-mars-movie-project
##                                                                          source_url
## 1 https://www.kickstarter.com/discover/categories/film%20&%20video/narrative%20film
##   spotlight staff_pick      state state_changed_at location_town location_state
## 1      TRUE       TRUE successful       2013-04-12     San Diego             CA
##   top_category   sub_category
## 1 film & video narrative film
# Mean emotion score per (outcome, emotion); .drop=FALSE keeps emotion
# levels with no hits. The summary column keeps its default name
# `mean(value)` because the plot below refers to it by that exact name.
success_failed_nrc_score<- whole_df_with_nrc_score %>% group_by(state,features,.drop=FALSE) %>% summarise(mean(value)) %>% filter(state=="successful"|state=="failed")
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
success_failed_nrc_score
## # A tibble: 16 x 3
## # Groups:   state [2]
##    state      features     `mean(value)`
##    <chr>      <fct>                <dbl>
##  1 failed     anger                0.196
##  2 failed     anticipation         0.66 
##  3 failed     disgust              0.105
##  4 failed     fear                 0.28 
##  5 failed     joy                  0.827
##  6 failed     sadness              0.329
##  7 failed     surprise             0.321
##  8 failed     trust                0.737
##  9 successful anger                0.277
## 10 successful anticipation         0.595
## 11 successful disgust              0.154
## 12 successful fear                 0.323
## 13 successful joy                  0.505
## 14 successful sadness              0.252
## 15 successful surprise             0.282
## 16 successful trust                0.643
# Diverging bar chart: successful projects' mean emotion scores extend one
# way, failed projects' the other (plotted as negatives and relabeled with
# absolute values). BUG FIX: the original added ylim(-1, 1) AFTER
# scale_y_continuous(), which silently replaced that scale and discarded the
# absolute-value labels; the limits now live inside the scale so the custom
# labels survive.
ggplot(success_failed_nrc_score, aes(x = reorder(features, `mean(value)`),
                                     y = `mean(value)`, fill = state)) +
  geom_bar(data = filter(success_failed_nrc_score, state == "successful"),
           aes(y = `mean(value)`, fill = state), stat = "identity") +
  geom_bar(data = filter(success_failed_nrc_score, state == "failed"),
           aes(y = -`mean(value)`, fill = state), stat = "identity") +
  scale_y_continuous(breaks = seq(-1, 1, 0.5),
                     labels = c(1, 0.5, 0, 0.5, 1),
                     limits = c(-1, 1)) +
  labs(title = "Differences in Average NRC Sentiment Value for Successful and Failed Projects",
       caption = "Source:Kickstarter",
       y = "Average NRC Sentiment Score") +
  scale_fill_manual(values = c("steelblue", "orange")) +
  theme_pander() +
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
        axis.text.x = element_text(face = "bold", size = 12),
        axis.text.y = element_text(face = "bold", size = 10),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = "top") +
  coord_flip()

From the above table, we can see that among the 1000 failed projects, joy has the highest average score, while among the most successful projects, trust has the highest average score. In addition, disgust has the lowest average score for both successful and failed projects. Lastly, joy shows the biggest difference between the most successful projects and the failed projects.